In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import urllib
import urlparse
from datetime import datetime

from bs4 import BeautifulSoup
import requests

In [75]:
##########
# SETTINGS
save_folder = os.path.join(os.environ['GD_FOLDER'], 'Shared/ms_otosense')
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.5 Safari/532.2'

################################################
# UTILS

html_re = re.compile(r'\.html$')  # matches a trailing .html extension

def url_encode(query):
    # percent-encode a query string for use in a url (spaces become '+')
    return urllib.quote_plus(query)

def hms_message(msg=''):
    # prefix a message with a zero-padded HH:MM:SS timestamp
    return "%s - %s" % (datetime.now().strftime('%H:%M:%S'), msg)

def save_text_to_file(s, filepath):
    with open(filepath, "w") as text_file:
        text_file.write(s.encode('utf-8'))

def filename_from_url(url):
    # map a url to a filesystem-safe filename ('/' -> '§', ':' -> '{')
    return url.replace('/', '§').replace(':', '{') + '.html'

def url_from_filename(filename):
    # inverse of filename_from_url
    return html_re.sub('', filename.replace('§', '/').replace('{', ':'))


################################################
# SPECIFIC METHODS

dogpile_base_url = 'http://www.dogpile.com'
dogpile_search_url = '/search/web?'

def qsi_from_result_page_number(page_number):
    # dogpile numbers results from 1, ten per page, via the qsi parameter
    return page_number * 10 + 1

def get_dogpile_request_url(query, result_page_number=0):
    '''
    Build the dogpile search url for a query, starting at the given result page.
    '''
    first_item_number = qsi_from_result_page_number(result_page_number)
    return urlparse.urljoin(base=dogpile_base_url,
                            url=dogpile_search_url
                            + urllib.urlencode(query={'q': query, 'qsi': first_item_number}))


################################################
# GENERAL METHODS

def get_url_from_seed(seed):
    return get_dogpile_request_url(seed)

def get_html_from_seed(seed):
    url = get_url_from_seed(seed)
    return get_html_of_url(url)

def get_html_of_url(url):
    # if dogpile starts blocking the default agent, pass
    # headers={'User-Agent': user_agent} (defined in SETTINGS) to requests.get
    response = requests.get(url=url)
    if response.ok:
        return response.text
    else:
        return None

def html_is_valid(html):
    # any non-empty html counts as valid for now
    return bool(html)

def log_progress(msg):
    print hms_message(msg)
    
def log_error(msg):
    print hms_message('ERROR: ' + msg)

def file_path_of_slurp(slurp_spec):
    return os.path.join(save_folder, filename_from_url(slurp_spec))

def save_html_of_slurp(html, slurp_spec):
    save_text_to_file(s=html, filepath=file_path_of_slurp(slurp_spec))
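
A quick sanity check on the helpers above: the sketch below round-trips a url through the filename encoding and builds a request url for a sample query (illustrative values; this cell was not part of the original run).


In [ ]:
url = get_dogpile_request_url('doorbell flash', result_page_number=2)
print url  # expect something like .../search/web?q=doorbell+flash&qsi=21
fn = filename_from_url(url)
assert url_from_filename(fn) == url  # the filename encoding is reversible
print file_path_of_slurp(url)  # where the slurped html would be saved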

In [76]:
# seed specification
seed_list = [
             '"smoke alarm" flash', '"fire alarm" flash', 'doorbell flash']
n_result_pages = 15

for i, seed in enumerate(seed_list[1:3]):  # only seeds 1 and 2 for this run
    for j in range(n_result_pages):  # j is also the result page number
        # slurp
        url = get_dogpile_request_url(seed, j)
        log_progress('seed %d, %d: %s (slurping %s)' % (i, j, seed, url))
        try:
            html = get_html_of_url(url)
        except Exception:
            log_error('seed %d, %d (%s): get_html_of_url(%s)' % (i, j, seed, url))
            continue  # skip to the next result page
        # process
        if html_is_valid(html):
            save_html_of_slurp(html, url)
        else:
            log_error('seed %d, %d (%s): html not valid: %s' % (i, j, seed, url))


14:25:33 - seed 0, 0: "fire alarm" flash (slurping http://www.dogpile.com/search/web?q=%22fire+alarm%22+flash&qsi=1)
14:25:34 - seed 0, 1: "fire alarm" flash (slurping http://www.dogpile.com/search/web?q=%22fire+alarm%22+flash&qsi=11)
14:25:35 - seed 0, 2: "fire alarm" flash (slurping http://www.dogpile.com/search/web?q=%22fire+alarm%22+flash&qsi=21)
14:25:37 - seed 1, 0: doorbell flash (slurping http://www.dogpile.com/search/web?q=doorbell+flash&qsi=1)
14:25:38 - seed 1, 1: doorbell flash (slurping http://www.dogpile.com/search/web?q=doorbell+flash&qsi=11)
14:25:39 - seed 1, 2: doorbell flash (slurping http://www.dogpile.com/search/web?q=doorbell+flash&qsi=21)
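
The saved slurps can be inventoried by listing save_folder and inverting the filename encoding with url_from_filename. A sketch along those lines (not part of the original run):


In [ ]:
# list the slurped pages by recovering each source url from its filename
for fn in os.listdir(save_folder):
    if html_re.search(fn):  # only the slurped .html files
        print url_from_filename(fn)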

Parse dogpile


In [191]:
def get_link_from_results(results_soup):
    # dogpile wraps the target url in a redirect link; the real url is in the 'ru' query param
    urlpane = results_soup.find('div', attrs={'class': 'resultDisplayUrlPane'})
    href = urlpane.find('a', attrs={'class': 'resultDisplayUrl'}).attrs['href']
    return urlparse.parse_qs(urlparse.urlparse(href).query)['ru'][0]

def get_title_text_from_results(results_soup):
    return results_soup.find('div', attrs={'class': 'resultTitlePane'}).get_text()

def get_description_text_from_results(results_soup):
    return results_soup.find('div', attrs={'class': 'resultDescription'}).get_text()


def get_web_results_dict_from_results_soup(results_soup):
    return {
        'link': get_link_from_results(results_soup),
        'title': get_title_text_from_results(results_soup),
        'description': get_description_text_from_results(results_soup),
    }

def parse_dogpile_html(html):
    b = BeautifulSoup(html)

    # the page has three result panes: ads on top, web results, ads on bottom
    result_tags = ['resultsAdsTop', 'resultsMain', 'resultsAdsBottom']
    parse_dict = {k: b.find('div', attrs={'id': k}) for k in result_tags}

    parse_dict['resultsAdsTop'] = parse_dict['resultsAdsTop'].findAll('div', attrs={'class': 'searchResult adResult'})
    parse_dict['resultsMain'] = parse_dict['resultsMain'].findAll('div', attrs={'class': 'searchResult webResult'})
    parse_dict['resultsAdsBottom'] = parse_dict['resultsAdsBottom'].findAll('div', attrs={'class': 'searchResult adResult'})

    parse_dict['resultsMain'] = [get_web_results_dict_from_results_soup(r) for r in parse_dict['resultsMain']]

    return parse_dict


def diagnose_parse_dict(parse_dict):
    print "parse_dict_keys: %s" % parse_dict.keys()
    print "number of resultsMain: %d" % len(parse_dict['resultsMain'])
    print parse_dict['resultsMain'][0]
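
To parse everything that was slurped, rather than just the last html left in memory, each saved file can be read back and run through parse_dogpile_html. A sketch (it assumes every saved page has all three result panes; pages missing one raise AttributeError, which is logged and skipped):


In [ ]:
# parse all saved slurps into one dict keyed by source url
parsed = {}
for fn in os.listdir(save_folder):
    if not html_re.search(fn):
        continue
    with open(os.path.join(save_folder, fn)) as fp:
        page_html = fp.read().decode('utf-8')  # files were saved as utf-8
    try:
        parsed[url_from_filename(fn)] = parse_dogpile_html(page_html)
    except AttributeError:
        log_error('could not parse %s' % fn)
print "parsed %d pages" % len(parsed)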

In [192]:
d = parse_dogpile_html(html)
diagnose_parse_dict(d)


parse_dict_keys: ['resultsAdsBottom', 'resultsAdsTop', 'resultsMain']
number of resultsMain: 10
{'link': 'http://www.devices4less.com/Doorbell.html', 'description': u'Offering remote signal recievers, door knock alerts, doorbell signalers, strobe remote recievers, vibes bed vibrators, telephone signalers, loud horn', 'title': u'\nDoorbell Strobe Lights - Doorbell Signalers - Door Knock Alerting\n'}
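
Since the web results are now plain dicts, pulling out just the outbound links is a one-liner (illustrative):


In [ ]:
links = [r['link'] for r in d['resultsMain']]
print '\n'.join(links[:3])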

Comments

Demo of kwargs


In [52]:
def fun(a, b=10):
    return a / float(b)

In [55]:
print fun(2,3)
print fun(2)
print fun(a=2)
print fun(b=4, a=2)


0.666666666667
0.2
0.2
0.5

In [68]:
def fun(a, **kwargs):
    print a + '---'
    for k, v in kwargs.items():
        print k
        print "--> %s" % v

In [69]:
fun('asdf', autre_var='booboo', and_yet_another='kiuki')


asdf---
autre_var
--> booboo
and_yet_another
--> kiuki
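
Unpacking works in the other direction too: an existing dict can be splatted into keyword arguments with **. Illustrative, not from the original session:


In [ ]:
options = {'autre_var': 'booboo', 'and_yet_another': 'kiuki'}
fun('asdf', **options)  # equivalent to the call above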

Demo of environment variables


In [4]:
import os

In [5]:
type(os.environ)


Out[5]:
instance

In [6]:
print os.environ.keys()


['VEN_LANG', 'VEN_PYTHONPATH', 'MON_AWS_ACCESS_KEY_ID', 'MON_AWS_SECRET_ACCESS_KEY', 'SHELL', 'COLORFGBG', 'VEN_PWD', 'VEN_Apple_PubSub_Socket_Render', 'PY_FOLDER', 'MS_DATA', 'DB_FOLDER', 'PYTHONPATH', 'VEN_GIT_PAGER', 'KHAN_SERVER', 'VEN_Apple_Ubiquity_Message', 'KHAN_CODE_FOLDER', 'LOCATION_S3', 'VEN_AWS_SECRET_ACCESS_KEY', 'VEN_PAGER', 'USER', 'PY_PACKAGES_FOLDER', 'SHLVL', 'VEN_SHELL', 'KHAN_PEM_FILE', 'VEN_VIRTUALENVWRAPPER_PROJECT_FILENAME', 'VEN_COMMAND_MODE', 'GD_FOLDER', 'MON_DNS', 'ITERM_PROFILE', 'VEN_COLORFGBG', 'TMPDIR', 'VEN_TERM_PROGRAM', 'KHAN_DATA', 'KHAN_NB_FOLDER', 'PAGER', 'COMMAND_MODE', 'VEN_S3_SECRET', 'VEN_ITERM_SESSION_ID', 'VEN_SSH_AUTH_SOCK', 'MON_PEM_FILE', 'HOME', 'VEN_DISPLAY', 'VEN_LOGNAME', 'TERM_PROGRAM', 'LANG', 'KHAN_LOG_FOLDER', 'Apple_PubSub_Socket_Render', 'VEN_ADWORDS_PASSWORD', '_', 'VEN_USER', 'VEN_PS1', 'VEN_AWS_ACCESS_KEY_ID', 'VEN_PATH', 'VEN_TMPDIR', 'GIT_PAGER', 'KHAN_PRINT_FILE', 'VEN_S3_ACCESS_KEY', 'VEN___CF_USER_TEXT_ENCODING', 'MON_SERVER', 'VEN_ADWORDS_TOKEN', 'VEN_SHLVL', '__CF_USER_TEXT_ENCODING', 'NOTEBOOK_FOLDER', 'OLDPWD', 'VEN_CLICOLOR', 'VEN_TERM', 'PY_PROJ_FOLDER', 'LOCATION_LOCAL', 'LOGNAME', 'DATA_LOG_FOLDER', 'PATH', 'CLICOLOR', 'TERM', 'ITERM_SESSION_ID', 'KHAN_DNS', 'DEV_FOLDER', 'SSH_AUTH_SOCK', 'VEN_OLDPWD', 'VEN_ADWORDS_EMAIL_PASSWORD', 'OMS_DATA', 'DISPLAY', 'VEN_ITERM_PROFILE', 'VEN_HOME', 'Apple_Ubiquity_Message', 'PWD', 'AWS_CREDENTIAL_FILE']

In [7]:
os.environ['GD_FOLDER']


Out[7]:
'/Users/thor/Google Drive'
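
os.environ behaves like a dict, so .get avoids a KeyError for unset variables. Illustrative, not from the original session:


In [ ]:
print os.environ.get('GD_FOLDER', '(not set)')
print os.environ.get('NO_SUCH_VAR', '(not set)')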
